{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Logistic Regression Classifier for Text"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Obtaining the IMDb movie review dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "100% | 80.23 MB | 6.03 MB/s | 13.30 sec elapsed"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import sys\n",
    "import tarfile\n",
    "import time\n",
    "import urllib.request\n",
    "\n",
    "source = 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'\n",
    "target = 'aclImdb_v1.tar.gz'\n",
    "\n",
    "if os.path.exists(target):\n",
    "    os.remove(target)\n",
    "\n",
    "def reporthook(count, block_size, total_size):\n",
    "    global start_time\n",
    "    if count == 0:\n",
    "        start_time = time.time()\n",
    "        return\n",
    "    duration = time.time() - start_time\n",
    "    progress_size = int(count * block_size)\n",
    "    speed = progress_size / (1024.**2 * duration)\n",
    "    percent = count * block_size * 100. / total_size\n",
    "\n",
    "    sys.stdout.write(f'\\r{int(percent)}% | {progress_size / (1024.**2):.2f} MB '\n",
    "                     f'| {speed:.2f} MB/s | {duration:.2f} sec elapsed')\n",
    "    sys.stdout.flush()\n",
    "\n",
    "\n",
    "if not os.path.isdir('aclImdb') and not os.path.isfile('aclImdb_v1.tar.gz'):\n",
    "    urllib.request.urlretrieve(source, target, reporthook)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "if not os.path.isdir('aclImdb'):\n",
    "\n",
    "    with tarfile.open(target, 'r:gz') as tar:\n",
    "        tar.extractall()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Preprocessing the movie dataset into more convenient format"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Install pyprind by uncommenting the next code cell."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Collecting pyprind\n",
      "  Using cached PyPrind-2.11.3-py2.py3-none-any.whl (8.4 kB)\n",
      "\u001b[33mWARNING: Error parsing requirements for soupsieve: [Errno 2] No such file or directory: '/Users/sebastian/miniforge3/lib/python3.10/site-packages/soupsieve-2.3.2.post1.dist-info/METADATA'\u001b[0m\u001b[33m\n",
      "\u001b[0m\u001b[33mDEPRECATION: omegaconf 2.0.6 has a non-standard dependency specifier PyYAML>=5.1.*. pip 23.3 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of omegaconf or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063\u001b[0m\u001b[33m\n",
      "\u001b[0mInstalling collected packages: pyprind\n",
      "Successfully installed pyprind-2.11.3\n"
     ]
    }
   ],
   "source": [
    "# !pip install pyprind"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0% [##############################] 100% | ETA: 00:00:00\n",
      "Total time elapsed: 00:00:20\n"
     ]
    }
   ],
   "source": [
    "import pyprind\n",
    "import pandas as pd\n",
    "import os\n",
    "import sys\n",
    "from packaging import version\n",
    "\n",
    "\n",
    "# change the `basepath` to the directory of the\n",
    "# unzipped movie dataset\n",
    "\n",
    "basepath = 'aclImdb'\n",
    "\n",
    "labels = {'pos': 1, 'neg': 0}\n",
    "\n",
    "# if the progress bar does not show, change stream=sys.stdout to stream=2\n",
    "pbar = pyprind.ProgBar(50000, stream=sys.stdout)\n",
    "\n",
    "df = pd.DataFrame()\n",
    "for s in ('test', 'train'):\n",
    "    for l in ('pos', 'neg'):\n",
    "        path = os.path.join(basepath, s, l)\n",
    "        for file in sorted(os.listdir(path)):\n",
    "            with open(os.path.join(path, file), \n",
    "                      'r', encoding='utf-8') as infile:\n",
    "                txt = infile.read()\n",
    "                \n",
    "            if version.parse(pd.__version__) >= version.parse(\"1.3.2\"):\n",
    "                x = pd.DataFrame([[txt, labels[l]]], columns=['review', 'sentiment'])\n",
    "                df = pd.concat([df, x], ignore_index=False)\n",
    "\n",
    "            else:\n",
    "                df = df.append([[txt, labels[l]]], \n",
    "                               ignore_index=True)\n",
    "            pbar.update()\n",
    "df.columns = ['review', 'sentiment']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Shuffling the DataFrame:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "\n",
    "if version.parse(pd.__version__) >= version.parse(\"1.3.2\"):\n",
    "    df = df.sample(frac=1, random_state=0).reset_index(drop=True)\n",
    "    \n",
    "else:\n",
    "    np.random.seed(0)\n",
    "    df = df.reindex(np.random.permutation(df.index))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Optional: Saving the assembled data as CSV file:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.to_csv('movie_data.csv', index=False, encoding='utf-8')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>review</th>\n",
       "      <th>sentiment</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>In 1974, the teenager Martha Moxley (Maggie Gr...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>OK... so... I really like Kris Kristofferson a...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>***SPOILER*** Do not read this, if you think a...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                              review  sentiment\n",
       "0  In 1974, the teenager Martha Moxley (Maggie Gr...          1\n",
       "1  OK... so... I really like Kris Kristofferson a...          0\n",
       "2  ***SPOILER*** Do not read this, if you think a...          0"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "df = pd.read_csv('movie_data.csv', encoding='utf-8')\n",
    "\n",
    "# the following is necessary on some computers:\n",
    "df = df.rename(columns={\"0\": \"review\", \"1\": \"sentiment\"})\n",
    "\n",
    "df.head(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(50000, 2)"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<br>\n",
    "<br>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Training a logistic regression model for document classification"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Strip HTML and punctuation to speed up the GridSearch later:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train = df.loc[:25000, 'review'].values\n",
    "y_train = df.loc[:25000, 'sentiment'].values\n",
    "X_test = df.loc[25000:, 'review'].values\n",
    "y_test = df.loc[25000:, 'sentiment'].values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk.stem.porter import PorterStemmer\n",
    "\n",
    "porter = PorterStemmer()\n",
    "\n",
    "def tokenizer(text):\n",
    "    return text.split()\n",
    "\n",
    "\n",
    "def tokenizer_porter(text):\n",
    "    return [porter.stem(word) for word in text.split()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package stopwords to\n",
      "[nltk_data]     /Users/sebastian/nltk_data...\n",
      "[nltk_data]   Unzipping corpora/stopwords.zip.\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import nltk\n",
    "\n",
    "nltk.download('stopwords')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['runner', 'like', 'run', 'run', 'lot']"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from nltk.corpus import stopwords\n",
    "\n",
    "stop = stopwords.words('english')\n",
    "[w for w in tokenizer_porter('a runner likes running and runs a lot')\n",
    " if w not in stop]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.model_selection import GridSearchCV\n",
    "\n",
    "tfidf = TfidfVectorizer(strip_accents=None,\n",
    "                        lowercase=False,\n",
    "                        preprocessor=None)\n",
    "\n",
    "small_param_grid = [{'vect__ngram_range': [(1, 1)],\n",
    "                     'vect__stop_words': [None],\n",
    "                     'vect__tokenizer': [tokenizer, tokenizer_porter],\n",
    "                     'clf__penalty': ['l2'],\n",
    "                     'clf__C': [1.0, 10.0]},\n",
    "                    {'vect__ngram_range': [(1, 1)],\n",
    "                     'vect__stop_words': [stop, None],\n",
    "                     'vect__tokenizer': [tokenizer],\n",
    "                     'vect__use_idf':[False],\n",
    "                     'vect__norm':[None],\n",
    "                     'clf__penalty': ['l2'],\n",
    "                  'clf__C': [1.0, 10.0]},\n",
    "              ]\n",
    "\n",
    "lr_tfidf = Pipeline([('vect', tfidf),\n",
    "                     ('clf', LogisticRegression(solver='liblinear'))])\n",
    "\n",
    "gs_lr_tfidf = GridSearchCV(lr_tfidf, small_param_grid,\n",
    "                           scoring='accuracy',\n",
    "                           cv=5,\n",
    "                           verbose=1,\n",
    "                           n_jobs=-1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Important Note about `n_jobs`**\n",
    "\n",
    "Please note that it is highly recommended to use `n_jobs=-1` (instead of `n_jobs=1`) in the previous code example to utilize all available cores on your machine and speed up the grid search. However, some Windows users reported issues when running the previous code with the `n_jobs=-1` setting related to pickling the tokenizer and tokenizer_porter functions for multiprocessing on Windows. Another workaround would be to replace those two functions, `[tokenizer, tokenizer_porter]`, with `[str.split]`. However, note that the replacement by the simple `str.split` would not support stemming."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Fitting 5 folds for each of 8 candidates, totalling 40 fits\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<style>#sk-container-id-1 {color: black;background-color: white;}#sk-container-id-1 pre{padding: 0;}#sk-container-id-1 div.sk-toggleable {background-color: white;}#sk-container-id-1 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-container-id-1 label.sk-toggleable__label-arrow:before {content: \"▸\";float: left;margin-right: 0.25em;color: #696969;}#sk-container-id-1 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-container-id-1 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-container-id-1 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-container-id-1 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-1 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-container-id-1 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"▾\";}#sk-container-id-1 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-1 div.sk-estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-container-id-1 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-1 div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-container-id-1 div.sk-label:hover label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: 0;}#sk-container-id-1 div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;position: relative;}#sk-container-id-1 div.sk-item {position: relative;z-index: 1;}#sk-container-id-1 div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;position: relative;}#sk-container-id-1 div.sk-item::before, #sk-container-id-1 div.sk-parallel-item::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: -1;}#sk-container-id-1 div.sk-parallel-item {display: flex;flex-direction: column;z-index: 1;position: relative;background-color: white;}#sk-container-id-1 div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-container-id-1 div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-container-id-1 div.sk-parallel-item:only-child::after {width: 0;}#sk-container-id-1 div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;}#sk-container-id-1 div.sk-label label {font-family: monospace;font-weight: bold;display: inline-block;line-height: 1.2em;}#sk-container-id-1 div.sk-label-container {text-align: center;}#sk-container-id-1 div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. See: https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !important;position: relative;}#sk-container-id-1 div.sk-text-repr-fallback {display: none;}</style><div id=\"sk-container-id-1\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>GridSearchCV(cv=5,\n",
       "             estimator=Pipeline(steps=[(&#x27;vect&#x27;,\n",
       "                                        TfidfVectorizer(lowercase=False)),\n",
       "                                       (&#x27;clf&#x27;,\n",
       "                                        LogisticRegression(solver=&#x27;liblinear&#x27;))]),\n",
       "             n_jobs=-1,\n",
       "             param_grid=[{&#x27;clf__C&#x27;: [1.0, 10.0], &#x27;clf__penalty&#x27;: [&#x27;l2&#x27;],\n",
       "                          &#x27;vect__ngram_range&#x27;: [(1, 1)],\n",
       "                          &#x27;vect__stop_words&#x27;: [None],\n",
       "                          &#x27;vect__tokenizer&#x27;: [&lt;function tokenizer at 0x168eab370&gt;,\n",
       "                                              &lt;function tokenizer_porter at 0x168eab2e0&gt;]},\n",
       "                         {&#x27;...\n",
       "                          &#x27;vect__stop_words&#x27;: [[&#x27;i&#x27;, &#x27;me&#x27;, &#x27;my&#x27;, &#x27;myself&#x27;, &#x27;we&#x27;,\n",
       "                                                &#x27;our&#x27;, &#x27;ours&#x27;, &#x27;ourselves&#x27;,\n",
       "                                                &#x27;you&#x27;, &quot;you&#x27;re&quot;, &quot;you&#x27;ve&quot;,\n",
       "                                                &quot;you&#x27;ll&quot;, &quot;you&#x27;d&quot;, &#x27;your&#x27;,\n",
       "                                                &#x27;yours&#x27;, &#x27;yourself&#x27;,\n",
       "                                                &#x27;yourselves&#x27;, &#x27;he&#x27;, &#x27;him&#x27;,\n",
       "                                                &#x27;his&#x27;, &#x27;himself&#x27;, &#x27;she&#x27;,\n",
       "                                                &quot;she&#x27;s&quot;, &#x27;her&#x27;, &#x27;hers&#x27;,\n",
       "                                                &#x27;herself&#x27;, &#x27;it&#x27;, &quot;it&#x27;s&quot;, &#x27;its&#x27;,\n",
       "                                                &#x27;itself&#x27;, ...],\n",
       "                                               None],\n",
       "                          &#x27;vect__tokenizer&#x27;: [&lt;function tokenizer at 0x168eab370&gt;],\n",
       "                          &#x27;vect__use_idf&#x27;: [False]}],\n",
       "             scoring=&#x27;accuracy&#x27;, verbose=1)</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item sk-dashed-wrapped\"><div class=\"sk-label-container\"><div class=\"sk-label sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-1\" type=\"checkbox\" ><label for=\"sk-estimator-id-1\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">GridSearchCV</label><div class=\"sk-toggleable__content\"><pre>GridSearchCV(cv=5,\n",
       "             estimator=Pipeline(steps=[(&#x27;vect&#x27;,\n",
       "                                        TfidfVectorizer(lowercase=False)),\n",
       "                                       (&#x27;clf&#x27;,\n",
       "                                        LogisticRegression(solver=&#x27;liblinear&#x27;))]),\n",
       "             n_jobs=-1,\n",
       "             param_grid=[{&#x27;clf__C&#x27;: [1.0, 10.0], &#x27;clf__penalty&#x27;: [&#x27;l2&#x27;],\n",
       "                          &#x27;vect__ngram_range&#x27;: [(1, 1)],\n",
       "                          &#x27;vect__stop_words&#x27;: [None],\n",
       "                          &#x27;vect__tokenizer&#x27;: [&lt;function tokenizer at 0x168eab370&gt;,\n",
       "                                              &lt;function tokenizer_porter at 0x168eab2e0&gt;]},\n",
       "                         {&#x27;...\n",
       "                          &#x27;vect__stop_words&#x27;: [[&#x27;i&#x27;, &#x27;me&#x27;, &#x27;my&#x27;, &#x27;myself&#x27;, &#x27;we&#x27;,\n",
       "                                                &#x27;our&#x27;, &#x27;ours&#x27;, &#x27;ourselves&#x27;,\n",
       "                                                &#x27;you&#x27;, &quot;you&#x27;re&quot;, &quot;you&#x27;ve&quot;,\n",
       "                                                &quot;you&#x27;ll&quot;, &quot;you&#x27;d&quot;, &#x27;your&#x27;,\n",
       "                                                &#x27;yours&#x27;, &#x27;yourself&#x27;,\n",
       "                                                &#x27;yourselves&#x27;, &#x27;he&#x27;, &#x27;him&#x27;,\n",
       "                                                &#x27;his&#x27;, &#x27;himself&#x27;, &#x27;she&#x27;,\n",
       "                                                &quot;she&#x27;s&quot;, &#x27;her&#x27;, &#x27;hers&#x27;,\n",
       "                                                &#x27;herself&#x27;, &#x27;it&#x27;, &quot;it&#x27;s&quot;, &#x27;its&#x27;,\n",
       "                                                &#x27;itself&#x27;, ...],\n",
       "                                               None],\n",
       "                          &#x27;vect__tokenizer&#x27;: [&lt;function tokenizer at 0x168eab370&gt;],\n",
       "                          &#x27;vect__use_idf&#x27;: [False]}],\n",
       "             scoring=&#x27;accuracy&#x27;, verbose=1)</pre></div></div></div><div class=\"sk-parallel\"><div class=\"sk-parallel-item\"><div class=\"sk-item\"><div class=\"sk-label-container\"><div class=\"sk-label sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-2\" type=\"checkbox\" ><label for=\"sk-estimator-id-2\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">estimator: Pipeline</label><div class=\"sk-toggleable__content\"><pre>Pipeline(steps=[(&#x27;vect&#x27;, TfidfVectorizer(lowercase=False)),\n",
       "                (&#x27;clf&#x27;, LogisticRegression(solver=&#x27;liblinear&#x27;))])</pre></div></div></div><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-3\" type=\"checkbox\" ><label for=\"sk-estimator-id-3\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">TfidfVectorizer</label><div class=\"sk-toggleable__content\"><pre>TfidfVectorizer(lowercase=False)</pre></div></div></div><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-4\" type=\"checkbox\" ><label for=\"sk-estimator-id-4\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">LogisticRegression</label><div class=\"sk-toggleable__content\"><pre>LogisticRegression(solver=&#x27;liblinear&#x27;)</pre></div></div></div></div></div></div></div></div></div></div></div></div>"
      ],
      "text/plain": [
       "GridSearchCV(cv=5,\n",
       "             estimator=Pipeline(steps=[('vect',\n",
       "                                        TfidfVectorizer(lowercase=False)),\n",
       "                                       ('clf',\n",
       "                                        LogisticRegression(solver='liblinear'))]),\n",
       "             n_jobs=-1,\n",
       "             param_grid=[{'clf__C': [1.0, 10.0], 'clf__penalty': ['l2'],\n",
       "                          'vect__ngram_range': [(1, 1)],\n",
       "                          'vect__stop_words': [None],\n",
       "                          'vect__tokenizer': [<function tokenizer at 0x168eab370>,\n",
       "                                              <function tokenizer_porter at 0x168eab2e0>]},\n",
       "                         {'...\n",
       "                          'vect__stop_words': [['i', 'me', 'my', 'myself', 'we',\n",
       "                                                'our', 'ours', 'ourselves',\n",
       "                                                'you', \"you're\", \"you've\",\n",
       "                                                \"you'll\", \"you'd\", 'your',\n",
       "                                                'yours', 'yourself',\n",
       "                                                'yourselves', 'he', 'him',\n",
       "                                                'his', 'himself', 'she',\n",
       "                                                \"she's\", 'her', 'hers',\n",
       "                                                'herself', 'it', \"it's\", 'its',\n",
       "                                                'itself', ...],\n",
       "                                               None],\n",
       "                          'vect__tokenizer': [<function tokenizer at 0x168eab370>],\n",
       "                          'vect__use_idf': [False]}],\n",
       "             scoring='accuracy', verbose=1)"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "gs_lr_tfidf.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Best parameter set: {'clf__C': 10.0, 'clf__penalty': 'l2', 'vect__ngram_range': (1, 1), 'vect__stop_words': None, 'vect__tokenizer': <function tokenizer at 0x168eab370>}\n",
      "CV Accuracy: 0.887\n"
     ]
    }
   ],
   "source": [
    "print(f'Best parameter set: {gs_lr_tfidf.best_params_}')\n",
    "print(f'CV Accuracy: {gs_lr_tfidf.best_score_:.3f}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Test Accuracy: 0.893\n"
     ]
    }
   ],
   "source": [
    "clf = gs_lr_tfidf.best_estimator_\n",
    "print(f'Test Accuracy: {clf.score(X_test, y_test):.3f}')"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.6"
  },
  "toc": {
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
